library(blockmodels)
library(tidyverse)
library(huge)
library(QUIC)
library(stabs)
library(limma)
library(igraph)
covariates <- read_delim("Covariates.txt" , delim = "\t") %>% dplyr::select(-subjectidp1)
transcripts <- readRDS("Transcripts.rds") %>% as_tibble(rownames = "subjectidp")
Merge covariates and exposures and retain rows shared by all tables. Extract the city variable.
expr <- semi_join(transcripts, covariates, by = "subjectidp") %>% dplyr::select(-subjectidp)
city <- semi_join(covariates, transcripts, by = "subjectidp") %>% pull(city)
Regarding the sampling distribution of the cities,
table(city)
## city
## Basel Norwich Piscina Turin Utrect
## 67 39 33 76 62
we only keep Basel, Turin and Utrect for the example.
expr <- filter(expr, city %in% c("Basel", "Turin", "Utrect"))
city <- city[city %in% c("Basel", "Turin", "Utrect")]
A basic PCA indicates that the city variable has a strong effect:
library(FactoMineR)
pca_expr <- PCA(expr, graph = FALSE)
design <- model.matrix(~ 0 + city)
colnames(design) <- c("Basel", "Turin", "Utrect")
fit <- lmFit(t(expr), design)
contrast.matrix <- makeContrasts(
Basel - Turin ,
Basel - Utrect ,
Turin - Utrect ,
levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
topTable(fit2, coef=1, adjust="BH")
## logFC AveExpr t P.Value adj.P.Val
## A_24_P56689 4.232290 9.131715 31.25779 4.399099e-80 1.036296e-75
## A_33_P3282364 2.276584 5.065083 28.78363 4.429614e-74 5.217420e-70
## A_33_P3353906 3.355395 9.307358 28.70862 6.812310e-74 5.349253e-70
## A_33_P3379535 2.617897 4.983144 27.97332 4.799348e-72 2.758292e-68
## A_21_P0010155 2.613143 5.198017 27.93926 5.854505e-72 2.758292e-68
## A_23_P140527 4.061183 10.212512 27.79182 1.385923e-71 5.441365e-68
## A_23_P29851 -2.130031 9.717812 -27.18302 5.004018e-70 1.683995e-66
## A_19_P00332515 3.044734 6.771544 26.91223 2.503191e-69 7.370959e-66
## A_24_P118376 2.840383 6.244197 26.68394 9.795361e-69 2.563881e-65
## A_19_P00320614 2.465980 5.468292 26.37280 6.355291e-68 1.497116e-64
## B
## A_24_P56689 172.1123
## A_33_P3282364 158.3896
## A_33_P3353906 157.9620
## A_33_P3379535 153.7343
## A_21_P0010155 153.5368
## A_23_P140527 152.6804
## A_23_P29851 149.1154
## A_19_P00332515 147.5149
## A_24_P118376 146.1583
## A_19_P00320614 144.2988
results <- decideTests(fit2, p.value = 1e-5)
vennDiagram(results)
Still too many guys to perform network inference properly !!
We only keep the first 50 more variants guys among the differentially expressed transcripts.
expr_DF <- expr[, which(rowSums(abs(results)) == 3)]
expr_sub <- expr_DF[, order(apply(expr_DF, 2, var), decreasing = TRUE)[1:50]]
heatmap(as.matrix(expr_sub))
We split the data set into three part (one per city) and perform network inference
expr_city <- lapply(split(expr_sub, city), as.matrix)
networks <- Map(huge, expr_city)
## Conducting Meinshausen & Buhlmann graph estimation (mb)....done
## Conducting Meinshausen & Buhlmann graph estimation (mb)....done
## Conducting Meinshausen & Buhlmann graph estimation (mb)....done
lapply(networks, plot)
## $Basel
## NULL
##
## $Turin
## NULL
##
## $Utrect
## NULL
StARS_analysis <- lapply(networks, huge.select, criterion ="stars", verbose = FALSE)
empty <- lapply(StARS_analysis, plot)
A <- StARS_analysis[[2]]$refit
colnames(A) <- rownames(A) <- colnames(expr_sub)
G_Turin <- graph_from_adjacency_matrix(A, mode = "undirected")
plot(cluster_fast_greedy(G_Turin), G_Turin)
A <- StARS_analysis[[3]]$refit
colnames(A) <- rownames(A) <- colnames(expr_sub)
G_Utrect <- graph_from_adjacency_matrix(A, mode = "undirected")
plot(cluster_fast_greedy(G_Utrect), G_Utrect)
G_inter <- graph.intersection(G_Turin, G_Utrect)
plot(cluster_fast_greedy(G_inter), G_inter, main = "intersection")
G_diff <- graph.difference(G_Turin, G_Utrect)
plot(cluster_fast_greedy(G_diff), G_diff, main = "difference")
We try an alternative solution both for network inference and resampling process, using the graphical-Lasso (from the quick package) and the stability selection approach as implemented n the package stabs.
stab_out <- lapply(expr_city, stabsel, fitfun = "quic.graphical_model", cutoff = 0.75, PFER = 10)
par(mfrow=c(2,3))
null <- lapply(stab_out, plot, type = "paths", print.all = FALSE, labels = 1:50, main = "path")
null <- lapply(stab_out, plot, print.all = FALSE, labels = 1:50, main = "selection")
G_list <- lapply(stab_out, function(stabs) {
graph_from_edgelist(
do.call(rbind, strsplit(names(stabs$selected), " : ")),
directed = FALSE
)
})
plot(cluster_fast_greedy(G_list[[1]]), G_list[[1]], main = "Basel")
plot(cluster_fast_greedy(G_list[[2]]), G_list[[2]], main = "Turin")
plot(cluster_fast_greedy(G_list[[3]]), G_list[[3]], main = "Utrect")
G_inter <- graph.intersection(G_list[[1]], G_list[[2]])
plot(cluster_fast_greedy(G_inter), G_inter, main = "intersection Basel/Turin")
This remains quite exploratory ! Everything needs to be put in perspective with, say, the proteomic data set. So it is your turn!
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] blockmodels_1.1.1 digest_0.6.15 Rcpp_0.12.17
## [4] FactoMineR_1.41 bindrcpp_0.2.2 limma_3.34.6
## [7] stabs_0.6-3 QUIC_1.1 huge_1.2.7
## [10] MASS_7.3-50 igraph_1.2.1 lattice_0.20-35
## [13] Matrix_1.2-14 forcats_0.3.0 stringr_1.3.1
## [16] dplyr_0.7.5 purrr_0.2.5 readr_1.1.1
## [19] tidyr_0.8.1 tibble_1.4.2 ggplot2_2.2.1
## [22] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.4 reshape2_1.4.3 haven_1.1.1
## [4] colorspace_1.3-2 htmltools_0.3.6 yaml_2.1.19
## [7] rlang_0.2.1 pillar_1.2.3 foreign_0.8-70
## [10] glue_1.2.0 modelr_0.1.2 readxl_1.1.0
## [13] bindr_0.1.1 plyr_1.8.4 munsell_0.5.0
## [16] gtable_0.2.0 cellranger_1.1.0 rvest_0.3.2
## [19] codetools_0.2-15 leaps_3.0 psych_1.8.4
## [22] evaluate_0.10.1 knitr_1.20 broom_0.4.4
## [25] flashClust_1.01-2 scales_0.5.0 backports_1.1.2
## [28] scatterplot3d_0.3-41 jsonlite_1.5 mnormt_1.5-5
## [31] hms_0.4.2 stringi_1.2.3 grid_3.4.4
## [34] rprojroot_1.3-2 cli_1.0.0 tools_3.4.4
## [37] magrittr_1.5 lazyeval_0.2.1 cluster_2.0.7-1
## [40] crayon_1.3.4 pkgconfig_2.0.1 xml2_1.2.0
## [43] lubridate_1.7.4 assertthat_0.2.0 rmarkdown_1.10
## [46] httr_1.3.1 rstudioapi_0.7 R6_2.2.2
## [49] nlme_3.1-137 compiler_3.4.4